# _*_coding:utf-8_*_
# Author:sy
# Created:2021/11/6 0006 11:18
# Version:1.0
from dianping.utils import *
import random
import re
import time

from lxml import etree


class Spider(object):
    """Dianping (大众点评) shop spider.

    Crawls listing pages for each requested category of one city, follows
    every shop's detail page to collect its address and phone number, and
    writes the merged rows to an Excel workbook via ExcelUtils.
    """

    # Listing-page URL template: filled with city code, category code, page number.
    list_url = "https://www.dianping.com/{}/{}/p{}"

    # Proxy-pool endpoint handed to ProxyUtils; must be configured before run().
    proxy_url = ""

    @classmethod
    def run(cls, city_code, type_code, page):
        """
        Run the spider and export the results to Excel.

        :param city_code: city code used in the URL, e.g. "ningbo"
        :param type_code: iterable of category codes, e.g. ["ch55", "ch30"]
        :param page: number of listing pages to crawl per category
        :return: None; results are written out via ExcelUtils
        :raises Exception: after more than 10 consecutive request failures
        """
        # Obtain an initial proxy IP; it is rotated whenever a request fails
        # or Dianping answers with its anti-bot verification page.
        proxy_util = ProxyUtils(proxy_url=cls.proxy_url)
        current_ip = proxy_util.get_proxy_ip()[0]
        all_data = []
        # Crawl every requested category.
        for code in type_code:
            error_count = 0
            for page_index in range(1, page + 1):
                if error_count > 10:
                    raise Exception("已连续失败10次，请切换Cookie信息")
                current_spider_url = cls.list_url.format(city_code, code, page_index)
                print("获取类型：{}的第{}数据：{}".format(code, page_index, current_spider_url))
                try:
                    response_text = RequestUtils.down_href(current_spider_url, current_ip)
                    error_count = 0  # reset the failure streak on success
                except Exception as e:
                    print("下载链接{}失败，失败原因：{}".format(current_spider_url, str(e)))
                    time.sleep(random.randint(6, 9))
                    error_count = error_count + 1
                    current_ip = proxy_util.get_proxy_ip()[0]
                    continue
                # Check whether the response is the anti-bot verification page.
                # BUGFIX: compare with -1 — str.find() returns 0 when the marker
                # sits at the very start of the text, which "> 0" would miss.
                if response_text.find('验证中心') != -1:
                    error_count = error_count + 1
                    print("当前IP{}访问大众点评需验证，正在切换IP,切换后继续".format(current_ip))
                    time.sleep(random.randint(6, 9))
                    current_ip = proxy_util.get_proxy_ip()[0]
                    continue
                print("HTML内容获取成功，开始解析内容")
                # Parse the listing page into shop stubs.
                shop_list = cls.__parse_list_html_text(response_text, code)
                print("解析数据完成，解析结果：{}".format(shop_list))

                # Visit each shop's detail page to fill in address and phone.
                detail_error_count = 0
                for detail_index, shop in enumerate(shop_list):
                    if detail_error_count > 10:
                        raise Exception("已连续失败10次，请切换Cookie信息")
                    print("获取商铺【{}】地址和电话，访问地址：{}".format(shop["shopName"], shop["shopHref"]))
                    try:
                        response_text = RequestUtils.down_href(shop["shopHref"], current_ip)
                        detail_error_count = 0  # reset the failure streak on success
                    except Exception as e:
                        print("下载链接{}失败，失败原因：{}".format(shop["shopHref"], str(e)))
                        time.sleep(random.randint(6, 9))
                        detail_error_count = detail_error_count + 1
                        current_ip = proxy_util.get_proxy_ip()[0]
                        continue
                    # Same anti-bot check as above (BUGFIX: != -1, not > 0).
                    if response_text.find('验证中心') != -1:
                        print("当前IP【{}】访问大众点评需验证，正在切换IP,切换后继续".format(current_ip))
                        time.sleep(random.randint(6, 9))
                        detail_error_count = detail_error_count + 1
                        current_ip = proxy_util.get_proxy_ip()[0]
                        continue

                    detail_data = cls.__parse_detail_html_text(response_text, code)
                    shop["shopAddress"] = detail_data["shopAddress"]
                    shop["shopPhone"] = detail_data["shopPhone"]
                    print("店铺具体信息解析成功:{}".format(shop))

                    # NOTE(review): only the first few shops of each page get
                    # details — presumably a throttling/debug cap; confirm intent.
                    if detail_index > 3:
                        break
                all_data.extend(shop_list)

        # Keep only shops for which detail data was actually collected.
        out_put_data = []
        for item in all_data:
            if "shopPhone" not in item.keys() and "shopAddress" not in item.keys():
                continue
            out_put_data.append(item)

        ExcelUtils.write_excel_data(out_put_data, "大众点评", "结果")

    @classmethod
    def __parse_list_html_text(cls, html_text, type_code):
        """
        Parse one listing page into a list of shop dicts.

        :param html_text: raw HTML of the listing page (may be None)
        :param type_code: category code; "ch55" pages use a different layout
        :return: list of dicts with keys shopId/shopName/typeCode/shopHref/active
        """
        out_put_data = []
        if html_text is not None:
            html = etree.HTML(html_text)
            if type_code != "ch55":
                # Standard listing layout under #shop-all-list.
                for li_tag in html.xpath('//*[@id="shop-all-list"]/ul/li'):
                    title = li_tag.xpath("./div[2]/div[1]/a/h4/text()")[0]
                    li_href = li_tag.xpath("./div[2]/div[1]/a/@href")[0]
                    shop_id = li_href.split("/")[-1]
                    # Join promo titles with "|"; the [1:] drops the first
                    # character of the joined string — presumably deliberate,
                    # TODO confirm against a live page.
                    active_arr = "|".join(li_tag.xpath('./div[3]/div/a/@title'))[1:]
                    out_put_data.append({
                        "shopId": shop_id,  # shop ID
                        "shopName": title,  # shop name
                        "typeCode": type_code,  # category code
                        "shopHref": li_href,  # detail-page URL
                        "active": active_arr  # promotional activities
                    })
            else:
                # "ch55" listing layout: relative hrefs under #J_boxList.
                for li_tag in html.xpath('//*[@id="J_boxList"]/ul/li'):
                    title = li_tag.xpath("./div[1]/p[1]/a/text()")[0]
                    li_href = "https://www.dianping.com{}".format(li_tag.xpath("./div[1]/p[1]/a/@href")[0])
                    shop_id = li_href.split("/")[-1]

                    active_arr = "|".join(li_tag.xpath('./div[1]/div/a/@title'))[1:]
                    out_put_data.append({
                        "shopId": shop_id,  # shop ID
                        "shopName": title,  # shop name
                        "typeCode": type_code,  # category code
                        "shopHref": li_href,  # detail-page URL
                        "active": active_arr  # promotional activities
                    })
        return out_put_data

    @classmethod
    def __parse_detail_html_text(cls, html_text, type_code):
        """
        Dispatch detail-page parsing to the parser matching type_code.

        :param html_text: raw HTML of the detail page
        :param type_code: category code deciding which layout parser applies
        :return: dict with "shopAddress" and "shopPhone" (empty strings when
                 nothing could be extracted)
        """
        if type_code == "ch50":
            try:
                return cls.__parse_text_1(html_text)
            except Exception:
                # Some ch50 pages use the ch55-style layout; fall back to it.
                return cls.__parse_text_4(html_text)
        if type_code in ("ch30", "ch35", "ch45", "ch95"):
            return cls.__parse_text_2(html_text)
        if type_code == "ch75":
            return cls.__parse_text_3(html_text)
        if type_code == "ch55":
            return cls.__parse_text_4(html_text)
        # BUGFIX: unknown codes previously fell through and returned None,
        # crashing the caller on subscription; return an empty record instead.
        return {"shopAddress": "", "shopPhone": ""}

    @classmethod
    def __parse_text_1(cls, html_text):
        """
        Parse a ch50-style detail page (also tried first before the ch55
        fallback, see __parse_detail_html_text).

        :param html_text: raw HTML of the detail page
        :return: dict with "shopAddress" (district + street) and "shopPhone"
        """
        html = etree.HTML(html_text)
        # District / region.
        region_tag = html.xpath('//*[@id="basic-info"]/div[2]/a/span/text()')
        if len(region_tag) != 0:
            region = region_tag[0].strip()
        else:
            region = ""
        # Street address.
        address_tag = html.xpath('//*[@id="basic-info"]/div[2]/span[2]/text()')
        if len(address_tag) != 0:
            address = address_tag[0].strip()
        else:
            address = ""
        # Phone number.
        shop_tel_tag = html.xpath('//*[@id="basic-info"]/p[1]/span[2]/text()')
        if len(shop_tel_tag) != 0:
            shop_tel = shop_tel_tag[0].strip()
        else:
            shop_tel = ""

        return {
            "shopAddress": region + address,
            "shopPhone": shop_tel
        }

    @classmethod
    def __parse_text_2(cls, html_text):
        """
        Parse a detail page whose text is obfuscated with a custom web font
        (ch30/ch35/ch45/ch95); glyphs are decoded via FontUtils.

        :param html_text: raw HTML of the detail page
        :return: dict with "shopAddress" and "shopPhone"
        """
        # Download the page's custom font so glyph codes can be translated.
        FontUtils.down_font(html_text)

        if html_text is not None:
            # Rewrite &#x....; glyph entities to *hex* markers and &nbsp; to a
            # sentinel so they survive HTML parsing for later translation.
            html_text = re.sub(r"&#x(\w+?);", r"*\1*", html_text)
            html_text = html_text.replace("&nbsp;", "@@@@@@@@")

            html = etree.HTML(html_text)

            # Address (font-decoded).
            address = FontUtils.translate_tag(html, "/html/body/div[2]/div/div[2]/div[1]/div[2]/div/span")

            # Phone (font-decoded), stripped of label/filler text.
            phone = FontUtils.translate_tag(html, "/html/body/div[2]/div/div[2]/div[1]/p")
            phone = phone.replace("电话：", "").replace("添加", "").replace("@@@@@@@@", ",").replace("该商户暂不收录点评", "")

            return {
                "shopAddress": address,
                "shopPhone": phone
            }
        # BUGFIX: previously returned None for None input, which would crash
        # the caller on subscription.
        return {"shopAddress": "", "shopPhone": ""}

    @classmethod
    def __parse_text_3(cls, html_text):
        """
        Parse a ch75-style detail page.

        :param html_text: raw HTML of the detail page
        :return: dict with "shopAddress" and "shopPhone"
        """
        if html_text is not None:
            html = etree.HTML(html_text)

            # Address: concatenate text nodes and strip whitespace.
            address = ''.join(
                html.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/div[2]/div[2]/text()')) \
                .replace(" ", '').replace("\n", "")
            # Phone lives in a data-phone attribute; the div index varies
            # between page variants, so try both.
            phone_tag = html.xpath(
                "/html/body/div[3]/div/div[1]/div[1]/div[2]/div[2]/div[4]/div/span/@data-phone")
            if len(phone_tag) == 0:
                phone_tag = html.xpath(
                    "/html/body/div[3]/div/div[1]/div[1]/div[2]/div[2]/div[3]/div/span/@data-phone")
            if len(phone_tag) != 0:
                tele_phone = phone_tag[0].strip()
            else:
                tele_phone = ""
            return {
                "shopAddress": address,
                "shopPhone": tele_phone
            }
        # BUGFIX: previously returned None for None input, which would crash
        # the caller on subscription.
        return {"shopAddress": "", "shopPhone": ""}

    @classmethod
    def __parse_text_4(cls, html_text):
        """
        Parse a ch55-style detail page; several page variants exist, so each
        field is probed with a cascade of XPaths until one matches.

        :param html_text: raw HTML of the detail page
        :return: dict with "shopAddress" and "shopPhone"
        """
        if html_text is not None:
            html = etree.HTML(html_text)

            # Address: try each known layout variant in turn.
            address_tag = html.xpath(
                "/html/body/div[4]/div[2]/div[1]/div[2]/div[1]/div/div[3]/div/span[2]/text()")
            if len(address_tag) == 0:
                address_tag = html.xpath("/html/body/div[5]/div[2]/div[2]/div/div[2]/div/span/text()")
            if len(address_tag) == 0:
                address_tag = html.xpath("/html/body/div[5]/div[1]/div[1]/div/div[3]/dl[1]/dd/span[1]/text()")
            if len(address_tag) == 0:
                address_tag = html.xpath("/html/body/div[5]/div[1]/div[2]/div/div[3]/div[1]/span/text()")
            if len(address_tag) == 0:
                address_tag = html.xpath("/html/body/div[5]/div[2]/div[2]/div/div[3]/div[1]/span/text()")
            if len(address_tag) == 0:
                address_tag = html.xpath("/html/body/div[5]/div[1]/div[2]/div/div[2]/div/span/text()")
            address = "".join(address_tag).strip()

            # Phone: same variant cascade.
            phone_tag = html.xpath("/html/body/div[4]/div[2]/div[1]/div[2]/div[4]/span[1]/text()")
            if len(phone_tag) == 0:
                phone_tag = html.xpath("/html/body/div[5]/div[2]/div[2]/div/p/span[2]/text()")
            if len(phone_tag) == 0:
                phone_tag = html.xpath("/html/body/div[4]/div[2]/div[1]/div[2]/div[3]/span[1]/text()")
            if len(phone_tag) == 0:
                phone_tag = html.xpath("/html/body/div[5]/div[1]/div[1]/div/div[3]/dl[2]/dd/a/@data-real")
            if len(phone_tag) == 0:
                phone_tag = html.xpath("/html/body/div[5]/div[1]/div[2]/div/p/span[2]/text()")

            if len(phone_tag) != 0:
                # Flatten whitespace/newlines and turn the filler sentinel
                # into a comma separator.
                tele_phone = "".join(
                    phone_tag[0].strip().replace("\\n", "").replace("@@@@@@@@@@@@@@@@", ",").split(
                        "\n")).replace(" ", "")
            else:
                tele_phone = ''

            return {
                "shopAddress": address,
                "shopPhone": tele_phone
            }
        # BUGFIX: previously returned None for None input, which would crash
        # the caller on subscription.
        return {"shopAddress": "", "shopPhone": ""}


if __name__ == "__main__":
    # Crawl 3 listing pages per category for Ningbo across all shop types.
    target_city = "ningbo"
    target_categories = ["ch55", "ch30", "ch35", "ch45", "ch75", "ch95", "ch50"]
    Spider.run(target_city, target_categories, 3)
